{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# importing libraries\n",
    "import numpy as np\n",
    "import time\n",
    "import math\n",
    "import pandas as pd\n",
    "pd.options.display.max_columns = None\n",
    "pd.options.display.max_rows = 200\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "from catboost import CatBoostClassifier\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset shape: (17996, 17)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Artist Name</th>\n",
       "      <th>Track Name</th>\n",
       "      <th>Popularity</th>\n",
       "      <th>danceability</th>\n",
       "      <th>energy</th>\n",
       "      <th>key</th>\n",
       "      <th>loudness</th>\n",
       "      <th>mode</th>\n",
       "      <th>speechiness</th>\n",
       "      <th>acousticness</th>\n",
       "      <th>instrumentalness</th>\n",
       "      <th>liveness</th>\n",
       "      <th>valence</th>\n",
       "      <th>tempo</th>\n",
       "      <th>duration_in min/ms</th>\n",
       "      <th>time_signature</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Bruno Mars</td>\n",
       "      <td>That's What I Like (feat. Gucci Mane)</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.854</td>\n",
       "      <td>0.564</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-4.964</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0485</td>\n",
       "      <td>0.017100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0849</td>\n",
       "      <td>0.8990</td>\n",
       "      <td>134.071</td>\n",
       "      <td>234596.0</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Boston</td>\n",
       "      <td>Hitch a Ride</td>\n",
       "      <td>54.0</td>\n",
       "      <td>0.382</td>\n",
       "      <td>0.814</td>\n",
       "      <td>3.0</td>\n",
       "      <td>-7.230</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0406</td>\n",
       "      <td>0.001100</td>\n",
       "      <td>0.004010</td>\n",
       "      <td>0.1010</td>\n",
       "      <td>0.5690</td>\n",
       "      <td>116.454</td>\n",
       "      <td>251733.0</td>\n",
       "      <td>4</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The Raincoats</td>\n",
       "      <td>No Side to Fall In</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0.434</td>\n",
       "      <td>0.614</td>\n",
       "      <td>6.0</td>\n",
       "      <td>-8.334</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0525</td>\n",
       "      <td>0.486000</td>\n",
       "      <td>0.000196</td>\n",
       "      <td>0.3940</td>\n",
       "      <td>0.7870</td>\n",
       "      <td>147.681</td>\n",
       "      <td>109667.0</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Deno</td>\n",
       "      <td>Lingo (feat. J.I &amp; Chunkz)</td>\n",
       "      <td>66.0</td>\n",
       "      <td>0.853</td>\n",
       "      <td>0.597</td>\n",
       "      <td>10.0</td>\n",
       "      <td>-6.528</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0555</td>\n",
       "      <td>0.021200</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.1220</td>\n",
       "      <td>0.5690</td>\n",
       "      <td>107.033</td>\n",
       "      <td>173968.0</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Red Hot Chili Peppers</td>\n",
       "      <td>Nobody Weird Like Me - Remastered</td>\n",
       "      <td>53.0</td>\n",
       "      <td>0.167</td>\n",
       "      <td>0.975</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-4.279</td>\n",
       "      <td>1</td>\n",
       "      <td>0.2160</td>\n",
       "      <td>0.000169</td>\n",
       "      <td>0.016100</td>\n",
       "      <td>0.1720</td>\n",
       "      <td>0.0918</td>\n",
       "      <td>199.060</td>\n",
       "      <td>229960.0</td>\n",
       "      <td>4</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Artist Name                             Track Name  Popularity  \\\n",
       "0             Bruno Mars  That's What I Like (feat. Gucci Mane)        60.0   \n",
       "1                 Boston                           Hitch a Ride        54.0   \n",
       "2          The Raincoats                     No Side to Fall In        35.0   \n",
       "3                   Deno             Lingo (feat. J.I & Chunkz)        66.0   \n",
       "4  Red Hot Chili Peppers      Nobody Weird Like Me - Remastered        53.0   \n",
       "\n",
       "   danceability  energy   key  loudness  mode  speechiness  acousticness  \\\n",
       "0         0.854   0.564   1.0    -4.964     1       0.0485      0.017100   \n",
       "1         0.382   0.814   3.0    -7.230     1       0.0406      0.001100   \n",
       "2         0.434   0.614   6.0    -8.334     1       0.0525      0.486000   \n",
       "3         0.853   0.597  10.0    -6.528     0       0.0555      0.021200   \n",
       "4         0.167   0.975   2.0    -4.279     1       0.2160      0.000169   \n",
       "\n",
       "   instrumentalness  liveness  valence    tempo  duration_in min/ms  \\\n",
       "0               NaN    0.0849   0.8990  134.071            234596.0   \n",
       "1          0.004010    0.1010   0.5690  116.454            251733.0   \n",
       "2          0.000196    0.3940   0.7870  147.681            109667.0   \n",
       "3               NaN    0.1220   0.5690  107.033            173968.0   \n",
       "4          0.016100    0.1720   0.0918  199.060            229960.0   \n",
       "\n",
       "   time_signature  Class  \n",
       "0               4      5  \n",
       "1               4     10  \n",
       "2               4      6  \n",
       "3               4      5  \n",
       "4               4     10  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data = pd.read_csv('train.csv')\n",
    "print(f\"Dataset shape: {raw_data.shape}\")\n",
    "raw_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7.0     2097\n",
       "2.0     1994\n",
       "9.0     1961\n",
       "1.0     1680\n",
       "4.0     1549\n",
       "11.0    1476\n",
       "5.0     1397\n",
       "6.0     1190\n",
       "8.0     1119\n",
       "10.0    1010\n",
       "3.0      509\n",
       "Name: key, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data['key'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10    4949\n",
       "6     2587\n",
       "9     2524\n",
       "8     1854\n",
       "5     1447\n",
       "1     1373\n",
       "2     1272\n",
       "0      625\n",
       "7      576\n",
       "3      402\n",
       "4      387\n",
       "Name: Class, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data['Class'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 17996 entries, 0 to 17995\n",
      "Data columns (total 17 columns):\n",
      " #   Column              Non-Null Count  Dtype  \n",
      "---  ------              --------------  -----  \n",
      " 0   Artist Name         17996 non-null  object \n",
      " 1   Track Name          17996 non-null  object \n",
      " 2   Popularity          17568 non-null  float64\n",
      " 3   danceability        17996 non-null  float64\n",
      " 4   energy              17996 non-null  float64\n",
      " 5   key                 15982 non-null  float64\n",
      " 6   loudness            17996 non-null  float64\n",
      " 7   mode                17996 non-null  int64  \n",
      " 8   speechiness         17996 non-null  float64\n",
      " 9   acousticness        17996 non-null  float64\n",
      " 10  instrumentalness    13619 non-null  float64\n",
      " 11  liveness            17996 non-null  float64\n",
      " 12  valence             17996 non-null  float64\n",
      " 13  tempo               17996 non-null  float64\n",
      " 14  duration_in min/ms  17996 non-null  float64\n",
      " 15  time_signature      17996 non-null  int64  \n",
      " 16  Class               17996 non-null  int64  \n",
      "dtypes: float64(12), int64(3), object(2)\n",
      "memory usage: 2.3+ MB\n"
     ]
    }
   ],
   "source": [
    "raw_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Artist Name              0\n",
       "Track Name               0\n",
       "Popularity             428\n",
       "danceability             0\n",
       "energy                   0\n",
       "key                   2014\n",
       "loudness                 0\n",
       "mode                     0\n",
       "speechiness              0\n",
       "acousticness             0\n",
       "instrumentalness      4377\n",
       "liveness                 0\n",
       "valence                  0\n",
       "tempo                    0\n",
       "duration_in min/ms       0\n",
       "time_signature           0\n",
       "Class                    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#imput missing with zero\n",
    "data = raw_data.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Artist Name           0\n",
       "Track Name            0\n",
       "Popularity            0\n",
       "danceability          0\n",
       "energy                0\n",
       "key                   0\n",
       "loudness              0\n",
       "mode                  0\n",
       "speechiness           0\n",
       "acousticness          0\n",
       "instrumentalness      0\n",
       "liveness              0\n",
       "valence               0\n",
       "tempo                 0\n",
       "duration_in min/ms    0\n",
       "time_signature        0\n",
       "Class                 0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15129"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['Track Name'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_col = ['Artist Name','Track Name']\n",
    "\n",
    "for i in cat_col:\n",
    "    data[i] = data[i].astype('str') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = data.drop('Class' ,axis = 1)\n",
    "y_train = data['Class']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# submission 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = pd.read_csv('test.csv')\n",
    "test_data = test_data.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<catboost.core.CatBoostClassifier at 0x1e98c433430>"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ideal parameters\n",
    "clf = CatBoostClassifier(loss_function='MultiClass',\n",
    "                         #eval_metric = 'Logloss',\n",
    "                         #learning_rate = 0.9,\n",
    "                         #l2_leaf_reg = 17,\n",
    "                         #rsm = 0.8,\n",
    "                         #scale_pos_weight = estimate,\n",
    "                         #custom_loss = 'F1',\n",
    "                         #iterations = 100,\n",
    "                         verbose=False\n",
    "                        )\n",
    "clf.fit(x_train,y_train , cat_features = cat_col)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred2 = clf.predict_proba(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([6.69845542e-05, 2.11424139e-01, 6.35946207e-03, 1.33700685e-05,\n",
       "       2.02112236e-05, 4.81802338e-04, 2.42754112e-01, 1.46369373e-05,\n",
       "       1.61416865e-02, 1.62511262e-02, 5.06472470e-01])"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred2[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_temp = pd.read_csv(\"submission.csv\")\n",
    "finalpreds = pd.DataFrame(y_pred2)\n",
    "finalpreds.columns = sub_temp.columns\n",
    "finalpreds.to_csv(\"submit.csv\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
